{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "# 数据清洗"
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "# 前两行画图\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# 数据处理\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# 系统库\n",
    "import os, sys\n",
    "\n",
    "# 自带数据\n",
    "datalib_path = os.path.join(os.path.abspath('.'), '../')\n",
    "sys.path.append(datalib_path)\n",
    "import dataset"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "source": [
    "# load data\n",
    "train = pd.read_csv(os.path.join(dataset.titanic_path, 'train.csv'))\n",
    "test = pd.read_csv(os.path.join(dataset.titanic_path, 'test.csv'))"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "source": [
    "# 转置\n",
    "train.T"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>881</th>\n",
       "      <th>882</th>\n",
       "      <th>883</th>\n",
       "      <th>884</th>\n",
       "      <th>885</th>\n",
       "      <th>886</th>\n",
       "      <th>887</th>\n",
       "      <th>888</th>\n",
       "      <th>889</th>\n",
       "      <th>890</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>PassengerId</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>...</td>\n",
       "      <td>882</td>\n",
       "      <td>883</td>\n",
       "      <td>884</td>\n",
       "      <td>885</td>\n",
       "      <td>886</td>\n",
       "      <td>887</td>\n",
       "      <td>888</td>\n",
       "      <td>889</td>\n",
       "      <td>890</td>\n",
       "      <td>891</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Survived</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Pclass</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Name</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>Moran, Mr. James</td>\n",
       "      <td>McCarthy, Mr. Timothy J</td>\n",
       "      <td>Palsson, Master. Gosta Leonard</td>\n",
       "      <td>Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)</td>\n",
       "      <td>Nasser, Mrs. Nicholas (Adele Achem)</td>\n",
       "      <td>...</td>\n",
       "      <td>Markun, Mr. Johann</td>\n",
       "      <td>Dahlberg, Miss. Gerda Ulrika</td>\n",
       "      <td>Banfield, Mr. Frederick James</td>\n",
       "      <td>Sutehall, Mr. Henry Jr</td>\n",
       "      <td>Rice, Mrs. William (Margaret Norton)</td>\n",
       "      <td>Montvila, Rev. Juozas</td>\n",
       "      <td>Graham, Miss. Margaret Edith</td>\n",
       "      <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
       "      <td>Behr, Mr. Karl Howell</td>\n",
       "      <td>Dooley, Mr. Patrick</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Sex</td>\n",
       "      <td>male</td>\n",
       "      <td>female</td>\n",
       "      <td>female</td>\n",
       "      <td>female</td>\n",
       "      <td>male</td>\n",
       "      <td>male</td>\n",
       "      <td>male</td>\n",
       "      <td>male</td>\n",
       "      <td>female</td>\n",
       "      <td>female</td>\n",
       "      <td>...</td>\n",
       "      <td>male</td>\n",
       "      <td>female</td>\n",
       "      <td>male</td>\n",
       "      <td>male</td>\n",
       "      <td>female</td>\n",
       "      <td>male</td>\n",
       "      <td>female</td>\n",
       "      <td>female</td>\n",
       "      <td>male</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Age</td>\n",
       "      <td>22</td>\n",
       "      <td>38</td>\n",
       "      <td>26</td>\n",
       "      <td>35</td>\n",
       "      <td>35</td>\n",
       "      <td>NaN</td>\n",
       "      <td>54</td>\n",
       "      <td>2</td>\n",
       "      <td>27</td>\n",
       "      <td>14</td>\n",
       "      <td>...</td>\n",
       "      <td>33</td>\n",
       "      <td>22</td>\n",
       "      <td>28</td>\n",
       "      <td>25</td>\n",
       "      <td>39</td>\n",
       "      <td>27</td>\n",
       "      <td>19</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>SibSp</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Parch</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Ticket</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>113803</td>\n",
       "      <td>373450</td>\n",
       "      <td>330877</td>\n",
       "      <td>17463</td>\n",
       "      <td>349909</td>\n",
       "      <td>347742</td>\n",
       "      <td>237736</td>\n",
       "      <td>...</td>\n",
       "      <td>349257</td>\n",
       "      <td>7552</td>\n",
       "      <td>C.A./SOTON 34068</td>\n",
       "      <td>SOTON/OQ 392076</td>\n",
       "      <td>382652</td>\n",
       "      <td>211536</td>\n",
       "      <td>112053</td>\n",
       "      <td>W./C. 6607</td>\n",
       "      <td>111369</td>\n",
       "      <td>370376</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Fare</td>\n",
       "      <td>7.25</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>7.925</td>\n",
       "      <td>53.1</td>\n",
       "      <td>8.05</td>\n",
       "      <td>8.4583</td>\n",
       "      <td>51.8625</td>\n",
       "      <td>21.075</td>\n",
       "      <td>11.1333</td>\n",
       "      <td>30.0708</td>\n",
       "      <td>...</td>\n",
       "      <td>7.8958</td>\n",
       "      <td>10.5167</td>\n",
       "      <td>10.5</td>\n",
       "      <td>7.05</td>\n",
       "      <td>29.125</td>\n",
       "      <td>13</td>\n",
       "      <td>30</td>\n",
       "      <td>23.45</td>\n",
       "      <td>30</td>\n",
       "      <td>7.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Cabin</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C85</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C123</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>E46</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>B42</td>\n",
       "      <td>NaN</td>\n",
       "      <td>C148</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>Embarked</td>\n",
       "      <td>S</td>\n",
       "      <td>C</td>\n",
       "      <td>S</td>\n",
       "      <td>S</td>\n",
       "      <td>S</td>\n",
       "      <td>Q</td>\n",
       "      <td>S</td>\n",
       "      <td>S</td>\n",
       "      <td>S</td>\n",
       "      <td>C</td>\n",
       "      <td>...</td>\n",
       "      <td>S</td>\n",
       "      <td>S</td>\n",
       "      <td>S</td>\n",
       "      <td>S</td>\n",
       "      <td>Q</td>\n",
       "      <td>S</td>\n",
       "      <td>S</td>\n",
       "      <td>S</td>\n",
       "      <td>C</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>12 rows × 891 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 0    \\\n",
       "PassengerId                        1   \n",
       "Survived                           0   \n",
       "Pclass                             3   \n",
       "Name         Braund, Mr. Owen Harris   \n",
       "Sex                             male   \n",
       "Age                               22   \n",
       "SibSp                              1   \n",
       "Parch                              0   \n",
       "Ticket                     A/5 21171   \n",
       "Fare                            7.25   \n",
       "Cabin                            NaN   \n",
       "Embarked                           S   \n",
       "\n",
       "                                                           1    \\\n",
       "PassengerId                                                  2   \n",
       "Survived                                                     1   \n",
       "Pclass                                                       1   \n",
       "Name         Cumings, Mrs. John Bradley (Florence Briggs Th...   \n",
       "Sex                                                     female   \n",
       "Age                                                         38   \n",
       "SibSp                                                        1   \n",
       "Parch                                                        0   \n",
       "Ticket                                                PC 17599   \n",
       "Fare                                                   71.2833   \n",
       "Cabin                                                      C85   \n",
       "Embarked                                                     C   \n",
       "\n",
       "                                2    \\\n",
       "PassengerId                       3   \n",
       "Survived                          1   \n",
       "Pclass                            3   \n",
       "Name         Heikkinen, Miss. Laina   \n",
       "Sex                          female   \n",
       "Age                              26   \n",
       "SibSp                             0   \n",
       "Parch                             0   \n",
       "Ticket             STON/O2. 3101282   \n",
       "Fare                          7.925   \n",
       "Cabin                           NaN   \n",
       "Embarked                          S   \n",
       "\n",
       "                                                      3    \\\n",
       "PassengerId                                             4   \n",
       "Survived                                                1   \n",
       "Pclass                                                  1   \n",
       "Name         Futrelle, Mrs. Jacques Heath (Lily May Peel)   \n",
       "Sex                                                female   \n",
       "Age                                                    35   \n",
       "SibSp                                                   1   \n",
       "Parch                                                   0   \n",
       "Ticket                                             113803   \n",
       "Fare                                                 53.1   \n",
       "Cabin                                                C123   \n",
       "Embarked                                                S   \n",
       "\n",
       "                                  4                 5    \\\n",
       "PassengerId                         5                 6   \n",
       "Survived                            0                 0   \n",
       "Pclass                              3                 3   \n",
       "Name         Allen, Mr. William Henry  Moran, Mr. James   \n",
       "Sex                              male              male   \n",
       "Age                                35               NaN   \n",
       "SibSp                               0                 0   \n",
       "Parch                               0                 0   \n",
       "Ticket                         373450            330877   \n",
       "Fare                             8.05            8.4583   \n",
       "Cabin                             NaN               NaN   \n",
       "Embarked                            S                 Q   \n",
       "\n",
       "                                 6                               7    \\\n",
       "PassengerId                        7                               8   \n",
       "Survived                           0                               0   \n",
       "Pclass                             1                               3   \n",
       "Name         McCarthy, Mr. Timothy J  Palsson, Master. Gosta Leonard   \n",
       "Sex                             male                            male   \n",
       "Age                               54                               2   \n",
       "SibSp                              0                               3   \n",
       "Parch                              0                               1   \n",
       "Ticket                         17463                          349909   \n",
       "Fare                         51.8625                          21.075   \n",
       "Cabin                            E46                             NaN   \n",
       "Embarked                           S                               S   \n",
       "\n",
       "                                                           8    \\\n",
       "PassengerId                                                  9   \n",
       "Survived                                                     1   \n",
       "Pclass                                                       3   \n",
       "Name         Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)   \n",
       "Sex                                                     female   \n",
       "Age                                                         27   \n",
       "SibSp                                                        0   \n",
       "Parch                                                        2   \n",
       "Ticket                                                  347742   \n",
       "Fare                                                   11.1333   \n",
       "Cabin                                                      NaN   \n",
       "Embarked                                                     S   \n",
       "\n",
       "                                             9    ...                 881  \\\n",
       "PassengerId                                   10  ...                 882   \n",
       "Survived                                       1  ...                   0   \n",
       "Pclass                                         2  ...                   3   \n",
       "Name         Nasser, Mrs. Nicholas (Adele Achem)  ...  Markun, Mr. Johann   \n",
       "Sex                                       female  ...                male   \n",
       "Age                                           14  ...                  33   \n",
       "SibSp                                          1  ...                   0   \n",
       "Parch                                          0  ...                   0   \n",
       "Ticket                                    237736  ...              349257   \n",
       "Fare                                     30.0708  ...              7.8958   \n",
       "Cabin                                        NaN  ...                 NaN   \n",
       "Embarked                                       C  ...                   S   \n",
       "\n",
       "                                      882                            883  \\\n",
       "PassengerId                           883                            884   \n",
       "Survived                                0                              0   \n",
       "Pclass                                  3                              2   \n",
       "Name         Dahlberg, Miss. Gerda Ulrika  Banfield, Mr. Frederick James   \n",
       "Sex                                female                           male   \n",
       "Age                                    22                             28   \n",
       "SibSp                                   0                              0   \n",
       "Parch                                   0                              0   \n",
       "Ticket                               7552               C.A./SOTON 34068   \n",
       "Fare                              10.5167                           10.5   \n",
       "Cabin                                 NaN                            NaN   \n",
       "Embarked                                S                              S   \n",
       "\n",
       "                                884                                   885  \\\n",
       "PassengerId                     885                                   886   \n",
       "Survived                          0                                     0   \n",
       "Pclass                            3                                     3   \n",
       "Name         Sutehall, Mr. Henry Jr  Rice, Mrs. William (Margaret Norton)   \n",
       "Sex                            male                                female   \n",
       "Age                              25                                    39   \n",
       "SibSp                             0                                     0   \n",
       "Parch                             0                                     5   \n",
       "Ticket              SOTON/OQ 392076                                382652   \n",
       "Fare                           7.05                                29.125   \n",
       "Cabin                           NaN                                   NaN   \n",
       "Embarked                          S                                     Q   \n",
       "\n",
       "                               886                           887  \\\n",
       "PassengerId                    887                           888   \n",
       "Survived                         0                             1   \n",
       "Pclass                           2                             1   \n",
       "Name         Montvila, Rev. Juozas  Graham, Miss. Margaret Edith   \n",
       "Sex                           male                        female   \n",
       "Age                             27                            19   \n",
       "SibSp                            0                             0   \n",
       "Parch                            0                             0   \n",
       "Ticket                      211536                        112053   \n",
       "Fare                            13                            30   \n",
       "Cabin                          NaN                           B42   \n",
       "Embarked                         S                             S   \n",
       "\n",
       "                                                  888                    889  \\\n",
       "PassengerId                                       889                    890   \n",
       "Survived                                            0                      1   \n",
       "Pclass                                              3                      1   \n",
       "Name         Johnston, Miss. Catherine Helen \"Carrie\"  Behr, Mr. Karl Howell   \n",
       "Sex                                            female                   male   \n",
       "Age                                               NaN                     26   \n",
       "SibSp                                               1                      0   \n",
       "Parch                                               2                      0   \n",
       "Ticket                                     W./C. 6607                 111369   \n",
       "Fare                                            23.45                     30   \n",
       "Cabin                                             NaN                   C148   \n",
       "Embarked                                            S                      C   \n",
       "\n",
       "                             890  \n",
       "PassengerId                  891  \n",
       "Survived                       0  \n",
       "Pclass                         3  \n",
       "Name         Dooley, Mr. Patrick  \n",
       "Sex                         male  \n",
       "Age                           32  \n",
       "SibSp                          0  \n",
       "Parch                          0  \n",
       "Ticket                    370376  \n",
       "Fare                        7.75  \n",
       "Cabin                        NaN  \n",
       "Embarked                       Q  \n",
       "\n",
       "[12 rows x 891 columns]"
      ]
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "source": [
    "# 清除重复行\n",
    "print(train.shape)\n",
    "train_dedup = train.drop_duplicates()\n",
    "print(train_dedup.shape)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(891, 12)\n",
      "(891, 12)\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "source": [
    "df = pd.DataFrame([\n",
    "    [1,1,1],\n",
    "    [2,2,2],\n",
    "    [3,3,3],\n",
    "    [1,1,1]\n",
    "])"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "source": [
    "print(df.shape)\n",
    "df_dedup = df.drop_duplicates()\n",
    "print(df_dedup.shape)"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "(4, 3)\n",
      "(3, 3)\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.6.8 64-bit ('base': conda)"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "interpreter": {
   "hash": "1fef96a86254b5b64b7294805a674893d583399788ea545149c5bfbe00efcc65"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}